{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/admin/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from utils import GeneSeg\n",
    "import csv,pickle,random,json\n",
    "import numpy as np\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.utils import to_categorical\n",
    "from sklearn.model_selection import train_test_split\n",
    "import tensorflow as tf\n",
    "\n",
    "# Pickled word2vec bundle; read and rewritten in place by pre_process().\n",
    "vec_dir=\"/home/admin/xssdetection/file/word2vec.pickle\"\n",
    "# Output files of pre_process(): one sample per line, written as index-list|label-list.\n",
    "pre_datas_train=\"/home/admin/xssdetection/file/pre_datas_train.csv\"\n",
    "pre_datas_test=\"/home/admin/xssdetection/file/pre_datas_test.csv\"\n",
    "# NOTE(review): only assigned, never read by any cell visible in this notebook.\n",
    "process_datas_dir=\"/home/admin/xssdetection/file/process_datas.pickle\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pre_process():\n",
    "    \"\"\"Build the index-encoded, padded train/test files and update the word2vec pickle.\n",
    "\n",
    "    Steps:\n",
    "      1. Load the word2vec bundle (dictionary and embeddings) from vec_dir.\n",
    "      2. Run GeneSeg on every payload of the XSS and the normal CSV file.\n",
    "      3. Map tokens to dictionary indices (unknown tokens use the UNK index),\n",
    "         pad all sequences with -1 and one-hot encode the labels\n",
    "         (1 = XSS, 0 = normal).\n",
    "      4. Shuffle, split 70/30 and write one line per sample, formatted as\n",
    "         indices|label, to pre_datas_train and pre_datas_test.\n",
    "      5. Store train_size, test_size, input_num and dims_num back into the\n",
    "         pickle at vec_dir.\n",
    "\n",
    "    Returns:\n",
    "        None. All results are written to disk.\n",
    "    \"\"\"\n",
    "    # NOTE(security): pickle.load can execute arbitrary code; vec_dir must be a trusted file.\n",
    "    with open(vec_dir,\"rb\") as f:\n",
    "        word2vec=pickle.load(f)\n",
    "    dictionary=word2vec[\"dictionary\"]\n",
    "    embeddings=word2vec[\"embeddings\"]\n",
    "\n",
    "    def load_payloads(path):\n",
    "        \"\"\"Return the GeneSeg result for every row of a one-column CSV file.\"\"\"\n",
    "        # newline=\"\" is the csv module's documented way to open its input, so\n",
    "        # that line breaks inside quoted fields are parsed correctly.\n",
    "        # NOTE(review): only the first CSV column is read; a payload with an\n",
    "        # unquoted comma would be truncated - confirm the files are quoted.\n",
    "        with open(path,\"r\",encoding=\"utf-8\",newline=\"\") as f:\n",
    "            reader=csv.DictReader(f,fieldnames=[\"payload\"])\n",
    "            return [GeneSeg(row[\"payload\"]) for row in reader]\n",
    "\n",
    "    def to_index(data):\n",
    "        \"\"\"Translate tokens to dictionary indices, falling back to the UNK index.\"\"\"\n",
    "        return [dictionary[word] if word in dictionary else dictionary[\"UNK\"] for word in data]\n",
    "\n",
    "    def write_samples(path,samples,sample_labels):\n",
    "        \"\"\"Write one indices|label line per sample to path.\"\"\"\n",
    "        with open(path,\"w\") as f:\n",
    "            for sample,label in zip(samples,sample_labels):\n",
    "                f.write(str(sample.tolist())+\"|\"+str(label.tolist())+\"\\n\")\n",
    "\n",
    "    xssed_data=load_payloads(\"/home/admin/xssdetection/data/xssed.csv\")\n",
    "    normal_data=load_payloads(\"/home/admin/xssdetection/data/normal_examples.csv\")\n",
    "    datas=xssed_data+normal_data\n",
    "    # Label 1 = XSS payload, 0 = normal sample; to_categorical one-hot encodes them.\n",
    "    labels=to_categorical([1]*len(xssed_data)+[0]*len(normal_data))\n",
    "    datas_index=[to_index(data) for data in datas]\n",
    "    # -1 marks padding positions so they can be told apart from real indices.\n",
    "    datas_index=pad_sequences(datas_index,value=-1)\n",
    "    print(type(datas_index),type(datas_index[0]))\n",
    "    # Shuffle samples and labels with the same permutation before splitting.\n",
    "    rand=random.sample(range(len(datas_index)),len(datas_index))\n",
    "    datas=[datas_index[index] for index in rand]\n",
    "    labels=[labels[index] for index in rand]\n",
    "    train_datas,test_datas,train_labels,test_labels=train_test_split(datas,labels,test_size=0.3)\n",
    "    print(type(train_datas),type(train_datas[0]))\n",
    "    print(np.shape(train_datas))\n",
    "    print(embeddings[\"UNK\"].shape)\n",
    "    train_size=len(train_labels)\n",
    "    test_size=len(test_labels)\n",
    "    # Persist the dataset dimensions next to the embeddings for later cells.\n",
    "    word2vec[\"train_size\"]=train_size\n",
    "    word2vec[\"test_size\"]=test_size\n",
    "    word2vec[\"input_num\"]=len(train_datas[0])\n",
    "    word2vec[\"dims_num\"]=embeddings[\"UNK\"].shape[0]\n",
    "    with open(vec_dir,\"wb\") as f:\n",
    "        pickle.dump(word2vec,f)\n",
    "    print(\"Saved word2vec to:\",vec_dir)\n",
    "    print(\"Write trian datas to:\",pre_datas_train)\n",
    "    write_samples(pre_datas_train,train_datas,train_labels)\n",
    "    print(\"Write test datas to:\",pre_datas_test)\n",
    "    write_samples(pre_datas_test,test_datas,test_labels)\n",
    "    print(\"Write datas over!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'numpy.ndarray'> <class 'numpy.ndarray'>\n",
      "<class 'list'> <class 'numpy.ndarray'>\n",
      "(168536, 532)\n",
      "(128,)\n",
      "Saved word2vec to: /home/admin/xssdetection/file/word2vec.pickle\n",
      "Write trian datas to: /home/admin/xssdetection/file/pre_datas_train.csv\n",
      "Write test datas to: /home/admin/xssdetection/file/pre_datas_test.csv\n",
      "Write datas over!\n"
     ]
    }
   ],
   "source": [
    "# Generate the train/test files and refresh the word2vec pickle (see pre_process above).\n",
    "pre_process()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def data_generator(data_dir):\n",
    "    \"\"\"Yield (data, label) samples parsed from a preprocessed text file.\n",
    "\n",
    "    Every line is expected to hold two JSON lists separated by a pipe\n",
    "    character: the index sequence and the label.\n",
    "\n",
    "    Args:\n",
    "        data_dir: path of the file to read (a file path, despite the name).\n",
    "\n",
    "    Yields:\n",
    "        Tuple (data, label) of numpy arrays built from the two JSON lists.\n",
    "\n",
    "    The loop never ends on its own. The queue-runner threads and the\n",
    "    session are released when the generator is closed or garbage collected.\n",
    "    \"\"\"\n",
    "    reader = tf.TextLineReader()\n",
    "    # No num_epochs is passed, so the file is presumably re-read indefinitely - TODO confirm.\n",
    "    queue = tf.train.string_input_producer([data_dir])\n",
    "    _, value = reader.read(queue)\n",
    "    coord = tf.train.Coordinator()\n",
    "    sess = tf.Session()\n",
    "    threads = tf.train.start_queue_runners(sess=sess, coord=coord)\n",
    "    try:\n",
    "        while True:\n",
    "            v = sess.run(value)\n",
    "            [data, label] = v.split(b\"|\")\n",
    "            data = np.array(json.loads(data.decode(\"utf-8\")))\n",
    "            label = np.array(json.loads(label.decode(\"utf-8\")))\n",
    "            yield (data, label)\n",
    "    finally:\n",
    "        # This cleanup used to sit after the infinite loop, where it could\n",
    "        # never run; in a finally block it executes on generator close.\n",
    "        coord.request_stop()\n",
    "        coord.join(threads)\n",
    "        sess.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: stream single samples from the training file.\n",
    "generator=data_generator(pre_datas_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([ -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,\n",
       "         -1,  -1,  -1, 546,   1, 389, 929, 176,   1,   1, 929, 267,   0,\n",
       "          1,   0,   1,   0,   1,   0,   1,   0,   1, 523,   1, 523]),\n",
       " array([1., 0.]))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at one sample: (index sequence padded with -1, one-hot label).\n",
    "next(generator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def batch_generator(datas_dir,datas_size,batch_size,embeddings,reverse_dictionary,train=True):\n",
    "    \"\"\"Yield (batch_data, batch_label) numpy batches of embedded samples.\n",
    "\n",
    "    Args:\n",
    "        datas_dir: path of a file in the format read by data_generator.\n",
    "        datas_size: number of samples to emit; only used when train is False.\n",
    "        batch_size: number of samples per batch.\n",
    "        embeddings: mapping from word to embedding vector; must contain \"UNK\".\n",
    "        reverse_dictionary: mapping from index to word.\n",
    "        train: if True, yield batches forever; if False, stop once datas_size\n",
    "            samples were emitted (the last batch may be shorter than batch_size).\n",
    "\n",
    "    Yields:\n",
    "        Tuple (np.array(batch_data), np.array(batch_label)). Every index of a\n",
    "        sample is replaced by its embedding, the padding index -1 by zeros.\n",
    "    \"\"\"\n",
    "    generator=data_generator(datas_dir)\n",
    "    # The padding vector is identical for every position, so build it once.\n",
    "    pad_embed=[0.0]*len(embeddings[\"UNK\"])\n",
    "    n=0\n",
    "    try:\n",
    "        while True:\n",
    "            batch_data=[]\n",
    "            batch_label=[]\n",
    "            for i in range(batch_size):\n",
    "                data,label=next(generator)\n",
    "                batch_data.append([embeddings[reverse_dictionary[d]] if d != -1 else pad_embed for d in data])\n",
    "                batch_label.append(label)\n",
    "                n+=1\n",
    "                if not train and n==datas_size:\n",
    "                    break\n",
    "            yield (np.array(batch_data),np.array(batch_label))\n",
    "            # In evaluation mode stop after the batch that reaches datas_size.\n",
    "            if not train and n==datas_size:\n",
    "                return\n",
    "    finally:\n",
    "        # Close the per-sample generator so its resources are not left open.\n",
    "        generator.close()\n",
    "def build_dataset(batch_size):\n",
    "    \"\"\"Create the train and test batch generators from the saved word2vec bundle.\n",
    "\n",
    "    Args:\n",
    "        batch_size: number of samples per yielded batch.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (train_generator, test_generator, train_size, test_size,\n",
    "        input_num, dims_num); the four numbers are read from the pickle.\n",
    "    \"\"\"\n",
    "    with open(vec_dir, \"rb\") as vec_file:\n",
    "        bundle = pickle.load(vec_file)\n",
    "    # Keys are looked up in the same order as before.\n",
    "    keys = (\"embeddings\", \"reverse_dictionary\", \"train_size\", \"test_size\", \"dims_num\", \"input_num\")\n",
    "    vectors, index_to_word, train_size, test_size, dims_num, input_num = (bundle[key] for key in keys)\n",
    "    shared_args = (batch_size, vectors, index_to_word)\n",
    "    train_generator = batch_generator(pre_datas_train, train_size, *shared_args)\n",
    "    # train=False makes the test generator stop after test_size samples.\n",
    "    test_generator = batch_generator(pre_datas_test, test_size, *shared_args, train=False)\n",
    "    return train_generator,test_generator,train_size,test_size,input_num,dims_num\n",
    "\n",
    "# 500 is the same value as the batch_size constant used later, where the LSTM layer is built with a fixed batch size.\n",
    "train_generator, test_generator, train_size, test_size, input_num, dims_num=build_dataset(500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(500, 168536, 72230, 532, 128)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: size of one training batch plus the dataset dimensions (this consumes one batch).\n",
    "len(next(train_generator)[0]),train_size,test_size,input_num,dims_num"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense,InputLayer,Dropout,LSTM\n",
    "from keras.optimizers import Adam\n",
    "# Read by train(); should match the batch size of the generators, since the LSTM layer is built with a fixed batch_size.\n",
    "batch_size=500\n",
    "epochs_num=1\n",
    "# Same value as assigned in the first cell; only assigned, never read by any cell visible in this notebook.\n",
    "process_datas_dir=\"/home/admin/xssdetection/file/process_datas.pickle\"\n",
    "# NOTE(review): model_dir is not read by any visible cell; the model is saved to a different path below.\n",
    "model_dir=\"/home/admin/xssdetection/file/LSTM_model\"\n",
    "def train(train_generator,train_size,input_num,dims_num):\n",
    "    \"\"\"Build, compile and fit a single-layer LSTM classifier with a softmax output.\n",
    "\n",
    "    Reads the module-level batch_size and epochs_num.\n",
    "\n",
    "    Args:\n",
    "        train_generator: generator yielding (inputs, labels) batches.\n",
    "        train_size: number of training samples; train_size // batch_size\n",
    "            steps are run per epoch.\n",
    "        input_num: first dimension of the LSTM input_shape (sequence length).\n",
    "        dims_num: second dimension of the LSTM input_shape (vector size).\n",
    "\n",
    "    Returns:\n",
    "        The fitted Sequential model.\n",
    "    \"\"\"\n",
    "    print(\"Start Train Job! \")\n",
    "    started_at=time.time()\n",
    "    # NOTE: an explicit InputLayer was tried here before; the author recorded that it\n",
    "    # made saving the model fail, so the input shape is passed to the LSTM directly.\n",
    "    layers=[\n",
    "        LSTM(16,input_shape=(input_num,dims_num),batch_size=batch_size),\n",
    "        Dense(2,activation=\"softmax\",name=\"Output\"),\n",
    "    ]\n",
    "    model=Sequential()\n",
    "    for layer in layers:\n",
    "        model.add(layer)\n",
    "    model.summary()\n",
    "    model.compile(optimizer='rmsprop',loss=\"categorical_crossentropy\",metrics=[\"accuracy\"])\n",
    "    steps=train_size//batch_size\n",
    "    model.fit_generator(train_generator,steps_per_epoch=steps,epochs=epochs_num)\n",
    "    elapsed=time.time()-started_at\n",
    "    print(\"Over train job in %f s\"%elapsed)\n",
    "    return model\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start Train Job! \n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "lstm_1 (LSTM)                (500, 16)                 9280      \n",
      "_________________________________________________________________\n",
      "Output (Dense)               (500, 2)                  34        \n",
      "=================================================================\n",
      "Total params: 9,314\n",
      "Trainable params: 9,314\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "Epoch 1/1\n",
      "337/337 [==============================] - 1359s 4s/step - loss: 0.0402 - acc: 0.9901\n",
      "Over train job in 1361.460702 s\n"
     ]
    }
   ],
   "source": [
    "# Fit the model on the training generator; the recorded run took about 1360 s for one epoch.\n",
    "model=train(train_generator,train_size,input_num,dims_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(500, 2)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape check on one test batch; note that this consumes a batch from test_generator.\n",
    "# NOTE(review): the stored execution count of this cell is out of order with its neighbours.\n",
    "model.predict_generator(test_generator,steps=1).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the trained model to disk. NOTE(review): this is a relative path, while the other paths in the notebook are absolute.\n",
    "# The commented-out lines are disabled alternatives (weights-only save/load and a JSON round-trip of the model).\n",
    "#from keras.models import model_from_json\n",
    "filename = './file/xss_lstm.h5'\n",
    "model.save(filename)\n",
    "#model.save_weights(filename)\n",
    "#model.load_weights(filename,by_name=True)\n",
    "#json_string = model.to_json()\n",
    "#model = model_from_json(json_string)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[9.9938452e-01, 6.1541318e-04],\n",
       "       [9.9871635e-01, 1.2836576e-03],\n",
       "       [9.9899739e-01, 1.0026500e-03],\n",
       "       ...,\n",
       "       [4.1994863e-04, 9.9958009e-01],\n",
       "       [9.9857724e-01, 1.4227253e-03],\n",
       "       [9.9821019e-01, 1.7897724e-03]], dtype=float32)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from keras.models import load_model\n",
    "# Reload the model from the same path it was saved to above, replacing the in-memory model.\n",
    "model=load_model('./file/xss_lstm.h5')\n",
    "# Predict on one batch; each call consumes the next batch from test_generator.\n",
    "model.predict_generator(test_generator,steps=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
